{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Create list with each line containing old path followed by new path.\n",
    "# Each input line has its gsutil copy-command prefix and any \"*\" wildcards removed,\n",
    "# leaving just the two paths.\n",
    "GSUTIL_PREFIX = \"gsutil -u broad-ctsa -m cp -r \"\n",
    "\n",
    "lines = []\n",
    "with open(\"reformat_buckets.txt\", \"r\") as f:\n",
    "    for raw_line in f:\n",
    "        line = raw_line.replace(GSUTIL_PREFIX, \"\").replace(\"*\", \"\").strip()\n",
    "        # Skip blank lines (e.g. a trailing empty line at the end of the file):\n",
    "        # they carry no old/new path pair\n",
    "        if line:\n",
    "            lines.append(line)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Create dict mapping current urls to new urls\n",
    "def build_mappings(path_lines, old_prefix=None, new_prefix=None):\n",
    "    \"\"\"Parse \"old_path new_path\" lines into an {old_path: new_path} dict.\n",
    "\n",
    "    If old_prefix is given, every occurrence of it in a line is replaced with\n",
    "    new_prefix before the line is parsed. Trailing slashes are removed from both\n",
    "    paths. Blank lines are skipped.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a non-blank line holds fewer than two paths.\n",
    "    \"\"\"\n",
    "    mappings = {}\n",
    "    for line in path_lines:\n",
    "        if old_prefix is not None:\n",
    "            line = line.replace(old_prefix, new_prefix)\n",
    "        # split() with no argument collapses runs of whitespace, so repeated spaces\n",
    "        # or tabs between the two paths cannot produce empty fields\n",
    "        fields = [x.rstrip(\"/\") for x in line.split()]\n",
    "        if not fields:\n",
    "            continue\n",
    "        if len(fields) < 2:\n",
    "            raise ValueError(f\"Expected 'old_path new_path', got: {line!r}\")\n",
    "        mappings[fields[0]] = fields[1]\n",
    "    return mappings\n",
    "\n",
    "\n",
    "new_mappings = {}\n",
    "\n",
    "# GCS\n",
    "# Mappings from old_name: new_name for hail-datasets-us\n",
    "new_mappings.update(build_mappings(lines))\n",
    "# Mappings from old_name: new_name for hail-datasets-eu\n",
    "new_mappings.update(build_mappings(lines, \"hail-datasets-us/\", \"hail-datasets-eu/\"))\n",
    "# AWS\n",
    "# Mappings from old_name: new_name for hail-datasets-us-east-1\n",
    "new_mappings.update(\n",
    "    build_mappings(lines, \"gs://hail-datasets-us/\", \"s3://hail-datasets-us-east-1/\")\n",
    ")\n",
    "\n",
    "# ensure_ascii=False can emit non-ASCII characters, so fix the file encoding explicitly\n",
    "with open(\"reformat_buckets_mappings.json\", \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(new_mappings, f, sort_keys=True, ensure_ascii=False, indent=2)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Load config file\n",
    "datasets_json_path = os.path.abspath(\"../../hail/python/hail/experimental/datasets.json\")\n",
    "with open(datasets_json_path, encoding=\"utf-8\") as f:\n",
    "    datasets_json = json.load(f)\n",
    "\n",
    "GTEX_SUFFIX = \"_all_snp_gene_associations\"\n",
    "# (cloud, region) url entries that may be present for a dataset version\n",
    "CLOUD_REGIONS = ((\"aws\", \"us\"), (\"gcp\", \"us\"), (\"gcp\", \"eu\"))\n",
    "\n",
    "# Update urls for all datasets according to new mappings.\n",
    "# dataset_names is a snapshot of the keys because GTEx entries are re-keyed in the loop.\n",
    "dataset_names = list(datasets_json)\n",
    "for name in dataset_names:\n",
    "    for version in datasets_json[name][\"versions\"]:\n",
    "        for cloud, region in CLOUD_REGIONS:\n",
    "            # Both the cloud and the region key are optional, so check each level\n",
    "            # before indexing\n",
    "            region_urls = version[\"url\"].get(cloud, {})\n",
    "            if region in region_urls:\n",
    "                old_url = region_urls[region]\n",
    "                # Urls without a mapping are left unchanged\n",
    "                region_urls[region] = new_mappings.get(old_url, old_url)\n",
    "    # Update GTEx names while we're at it. Names that already end with the suffix\n",
    "    # are skipped so that re-running this cell does not rename them a second time.\n",
    "    is_gtex_qtl = \"GTEx_eQTL\" in name or \"GTEx_sQTL\" in name\n",
    "    if is_gtex_qtl and not name.endswith(GTEX_SUFFIX):\n",
    "        # With at most 3 splits, element 1 is the QTL type and the last element is\n",
    "        # the remainder of the name, which is used as the tissue\n",
    "        name_parts = name.split(\"_\", 3)\n",
    "        qtl = name_parts[1]\n",
    "        tissue = name_parts[-1]\n",
    "        updated_name = f\"GTEx_{qtl}_{tissue}{GTEX_SUFFIX}\"\n",
    "        datasets_json[updated_name] = datasets_json.pop(name)\n",
    "\n",
    "# Write new entries to config file\n",
    "# ensure_ascii=False can emit non-ASCII characters, so fix the file encoding explicitly\n",
    "with open(datasets_json_path, \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(datasets_json, f, sort_keys=True, ensure_ascii=False, indent=2)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import hail as hl\n",
    "# Configure the s3a filesystem to use the anonymous AWS credentials provider\n",
    "hl.init(spark_conf={\"spark.hadoop.fs.s3a.aws.credentials.provider\":\n",
    "                        \"org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider\"})\n",
    "\n",
    "\n",
    "def read_dataset(url):\n",
    "    \"\"\"Print url, then open it with the hail reader chosen by its extension.\n",
    "\n",
    "    \".ht\" urls are read as tables and \".mt\" urls as matrix tables; any other\n",
    "    url is read as a block matrix. Returns whatever the reader returns.\n",
    "    \"\"\"\n",
    "    print(url)\n",
    "    if url.endswith(\".ht\"):\n",
    "        return hl.read_table(url)\n",
    "    if url.endswith(\".mt\"):\n",
    "        return hl.read_matrix_table(url)\n",
    "    return hl.linalg.BlockMatrix.read(url)\n",
    "\n",
    "\n",
    "# Test that we can load datasets from GCS and AWS\n",
    "datasets_json_path = os.path.abspath(\"../../hail/python/hail/experimental/datasets.json\")\n",
    "with open(datasets_json_path, encoding=\"utf-8\") as f:\n",
    "    datasets_json = json.load(f)\n",
    "\n",
    "dataset_names = list(datasets_json)\n",
    "for name in dataset_names:\n",
    "    print(name)\n",
    "    for version in datasets_json[name][\"versions\"]:\n",
    "        # Both the cloud and the region key are optional, so check each level\n",
    "        # before indexing\n",
    "        gcp_urls = version[\"url\"].get(\"gcp\", {})\n",
    "        for region in (\"us\", \"eu\"):\n",
    "            if region in gcp_urls:\n",
    "                read_dataset(gcp_urls[region])\n",
    "        aws_urls = version[\"url\"].get(\"aws\", {})\n",
    "        if \"us\" in aws_urls:\n",
    "            # Rewrite to the s3a:// scheme, which is the filesystem configured in\n",
    "            # hl.init above\n",
    "            read_dataset(aws_urls[\"us\"].replace(\"s3://\", \"s3a://\"))"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}